import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import multiprocessing
# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "plotly_mimetype+notebook"
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('data_science_task_dataset.csv')
# For the purposes of this assignment, I did not want to touch the original df. So I'm making a copy of it.
df = data.copy()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 232319 entries, 0 to 232318 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 232319 non-null int64 1 signup_time 232319 non-null object 2 ages 232319 non-null object 3 genders 232319 non-null object 4 plans 232319 non-null object 5 plan_months 232319 non-null int64 6 payment_time 43338 non-null object 7 is_early_bird 232319 non-null bool 8 payment_amount 232319 non-null int64 dtypes: bool(1), int64(3), object(5) memory usage: 14.4+ MB
df.describe()
| Unnamed: 0 | plan_months | payment_amount | |
|---|---|---|---|
| count | 232319.000000 | 232319.000000 | 232319.000000 |
| mean | 116159.000000 | 9.007541 | 4358.553119 |
| std | 67064.862931 | 4.132480 | 12067.602454 |
| min | 0.000000 | 3.000000 | 0.000000 |
| 25% | 58079.500000 | 3.000000 | 0.000000 |
| 50% | 116159.000000 | 12.000000 | 0.000000 |
| 75% | 174238.500000 | 12.000000 | 0.000000 |
| max | 232318.000000 | 12.000000 | 223014.000000 |
df.sample(15)
| Unnamed: 0 | signup_time | ages | genders | plans | plan_months | payment_time | is_early_bird | payment_amount | |
|---|---|---|---|---|---|---|---|---|---|
| 130851 | 131597 | 2022-07-03 09:01:29 | 44 | MALE | PLUS | 12 | NaN | False | 0 |
| 105433 | 106046 | 2022-06-21 07:28:57 | 34, 18 | FEMALE, FEMALE | PLUS, PLUS | 12 | NaN | False | 0 |
| 56782 | 56456 | 2022-05-24 14:52:51 | 27, 2 | MALE, FEMALE | PLUS, PLUS | 12 | NaN | False | 0 |
| 162457 | 160826 | 2022-07-14 21:59:07 | 14 | FEMALE | PLUS | 12 | 2022-08-04 16:33:45 | False | 14714 |
| 7607 | 7618 | 2022-03-01 00:13:40 | 43 | MALE | PLUS | 12 | NaN | False | 0 |
| 223087 | 222257 | 2022-07-29 23:25:03 | 27 | FEMALE | LITE | 12 | NaN | False | 0 |
| 41872 | 42030 | 2022-05-09 04:31:42 | 36, 27 | MALE, FEMALE | PLUS, PLUS | 3 | NaN | False | 0 |
| 11568 | 11537 | 2022-03-16 11:49:00 | 42 | MALE | LITE | 12 | NaN | False | 0 |
| 184195 | 185350 | 2022-07-21 03:16:33 | 29, 32 | MALE, MALE | PLUS, PLUS | 12 | NaN | False | 0 |
| 79608 | 76837 | 2022-06-05 18:24:29 | 25 | FEMALE | PLUS | 3 | NaN | False | 0 |
| 80737 | 80765 | 2022-06-06 05:10:01 | 46 | FEMALE | LITE | 12 | NaN | False | 0 |
| 160165 | 159671 | 2022-07-13 21:30:35 | 30 | MALE | PLUS | 12 | NaN | False | 0 |
| 80878 | 81451 | 2022-06-06 07:57:50 | 29, 5, 47 | MALE, FEMALE, MALE | PLUS, PLUS, PLUS | 12 | NaN | False | 0 |
| 8723 | 8750 | 2022-03-05 17:35:47 | 24, 34 | MALE, MALE | PLUS, LITE | 12 | NaN | False | 0 |
| 804 | 839 | 2022-01-11 08:34:28 | 44 | FEMALE | PLUS | 12 | NaN | False | 0 |
After playing around with the data a bit (please leave in a rough "Exploration" section showing how you did that :)), what do you think are 2 crucial data breakdowns or plots to be shown if you were presenting this data to the wider team? If you think multiple options are possible, feel free to say why you picked those 2
df.drop(columns=['Unnamed: 0'], inplace = True)
df.head(2)
| signup_time | ages | genders | plans | plan_months | payment_time | is_early_bird | payment_amount | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-01 00:05:27 | 45 | FEMALE | PLUS | 12 | NaN | False | 0 |
| 1 | 2022-01-01 00:06:44 | 42 | MALE | PLUS | 12 | NaN | False | 0 |
df['payment_complete'] = df['payment_amount'].apply(lambda x: 'No' if x == 0 else 'Yes')
df['number_of_people'] = df['ages'].apply(lambda x: len(x.split(', ')))
df['number_of_male'] = df['genders'].apply(lambda x: len([val for val in x.split(', ') if val == 'MALE']))
df['number_of_female'] = df['genders'].apply(lambda x: len([val for val in x.split(', ') if val == 'FEMALE']))
df['number_of_kids'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) <= 18]))
# df['number_of_adults'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if ((int(x) > 18) & (int(x) < 60))]))
# df['number_of_elderly'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) >= 60]))
df['number_of_adults'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) > 18]))
df_paid = df.loc[df['payment_complete'] == 'Yes']
df_unpaid = df.loc[df['payment_complete'] == 'No']
plan_counts = df['plans'].str.split(', ').explode().value_counts()
fig_plan = px.bar(x=plan_counts.index, y=plan_counts.values, labels={'x': 'Plan', 'y': 'Count'}, title='Plan Distribution')
fig_plan.update_traces(text=plan_counts.values, textposition='inside') # Add count values to the bars
fig_plan.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_plan.show()
plan_counts_paid = df_paid['plans'].str.split(', ').explode().value_counts()
plan_counts_unpaid = df_unpaid['plans'].str.split(', ').explode().value_counts()
total_paid = plan_counts_paid.sum()
total_unpaid = plan_counts_unpaid.sum()
total_count = total_paid + total_unpaid
percentage_plus_paid = (plan_counts_paid['PLUS'] / total_count) * 100
percentage_plus_unpaid = (plan_counts_unpaid['PLUS'] / total_count) * 100
percentage_lite_paid = (plan_counts_paid['LITE'] / total_count) * 100
percentage_lite_unpaid = (plan_counts_unpaid['LITE'] / total_count) * 100
fig_plan = go.Figure()
fig_plan.add_trace(go.Bar(
x=['PLUS (Paid)', 'LITE (Paid)'],
y=[percentage_plus_paid, percentage_lite_paid],
text=[f"{percentage_plus_paid:.2f}%", f"{percentage_lite_paid:.2f}%"],
textposition='auto',
name="Paid Plan"
))
fig_plan.add_trace(go.Bar(
x=['PLUS (Unpaid)', 'LITE (Unpaid)'],
y=[percentage_plus_unpaid, percentage_lite_unpaid],
text=[f"{percentage_plus_unpaid:.2f}%", f"{percentage_lite_unpaid:.2f}%"],
textposition='auto',
name="Unpaid Plan"
))
fig_plan.update_layout(
title='Plan distribution between Paid and Not Paid users',
xaxis_title='Plan',
yaxis_title='Percentage',
template='plotly_dark',
font=dict(color='white'),
yaxis_tickformat='%'
)
fig_plan.show()
total_paid = plan_counts_paid.sum()
total_unpaid = plan_counts_unpaid.sum()
percentage_plus_paid = (plan_counts_paid['PLUS'] / total_paid) * 100
percentage_plus_unpaid = (plan_counts_unpaid['PLUS'] / total_unpaid) * 100
percentage_lite_paid = (plan_counts_paid['LITE'] / total_paid) * 100
percentage_lite_unpaid = (plan_counts_unpaid['LITE'] / total_unpaid) * 100
fig_plan = go.Figure()
fig_plan.add_trace(go.Bar(
x=['PLUS (Paid)', 'LITE (Paid)'],
y=[percentage_plus_paid, percentage_lite_paid],
text=[f"{percentage_plus_paid:.2f}%", f"{percentage_lite_paid:.2f}%"],
textposition='auto',
name="Paid Users"
))
fig_plan.add_trace(go.Bar(
x=['PLUS (Unpaid)', 'LITE (Unpaid)'],
y=[percentage_plus_unpaid, percentage_lite_unpaid],
text=[f"{percentage_plus_unpaid:.2f}%", f"{percentage_lite_unpaid:.2f}%"],
textposition='auto',
name="Unpaid Users"
))
fig_plan.update_layout(
title='Plan distribution between Paid and Not Paid users',
xaxis_title='Plan',
yaxis_title='Percentage',
template='plotly_dark',
font=dict(color='white'),
# yaxis_tickformat='%'
)
fig_plan.show()
payment_status = ['Paid', 'Not Paid']
payment_counts = [len(df.dropna(subset=['payment_time'])), len(df[df['payment_time'].isna()])]
fig_payment = px.pie(names=payment_status, values=payment_counts, title='Payment Completion Status')
fig_payment.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_payment.show()
import plotly.graph_objects as go
paid_users = len(df.dropna(subset=['payment_time']))
unpaid_users = len(df[df['payment_time'].isna()])
paid_plus_users = len(df[(df['payment_complete'] == 'Yes') & (df['plans'].str.contains('PLUS'))])
paid_lite_users = len(df[(df['payment_complete'] == 'Yes') & (df['plans'].str.contains('LITE'))])
unpaid_plus_users = len(df[(df['payment_complete'] == 'No') & (df['plans'].str.contains('PLUS'))])
unpaid_lite_users = len(df[(df['payment_complete'] == 'No') & (df['plans'].str.contains('LITE'))])
fig_payment_info = go.Figure(data=[go.Pie(
labels=['Paid PLUS', 'Paid LITE', 'Not Paid PLUS', 'Not Paid LITE'],
values=[paid_plus_users, paid_lite_users, unpaid_plus_users, unpaid_lite_users],
textinfo='label+percent',
hole=0,
marker=dict(colors=['#FF7F0E', '#1F77B4', '#FFC700', '#FF3860'],
line=dict(color='#FFFFFF', width=1)),
sort=False
)])
fig_payment_info.update_layout(
title='Payment and Plan Information Distribution',
template='plotly_dark',
font=dict(color='white'),
showlegend=False
)
fig_payment_info.show()
df['number_of_kids'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) <= 18]))
# df['number_of_adults'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if ((int(x) > 18) & (int(x) < 60))]))
# df['number_of_elderly'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) >= 60]))
df['number_of_adults'] = df['ages'].apply(lambda x: len([x for x in x.split(', ') if int(x) > 18]))
sum_kids = df['number_of_kids'].sum()
sum_adults = df['number_of_adults'].sum()
sum_total = sum_kids + sum_adults
percent_kids = (sum_kids / sum_total) * 100
percent_adults = (sum_adults / sum_total) * 100
age_dict = {'Adults': [percent_adults], 'Children': [percent_kids]}
age_group_counts = pd.DataFrame(age_dict).T
age_group_counts = pd.DataFrame(age_group_counts).reset_index(drop = False)
age_group_counts.rename(columns = {'index': 'age_group', 0: 'count'}, inplace = True)
fig_age_group = px.bar(x=age_group_counts['age_group'], y=age_group_counts['count'],
labels={'x': 'Age Group', 'y': 'Count'}, title='Age Group Distribution')
fig_age_group.update_traces(texttemplate='%{y}', textposition='inside')
fig_age_group.update_layout(yaxis=dict(automargin=True))
fig_age_group.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_age_group.show()
import plotly.express as px
fig_early_hist = px.histogram(df, x='is_early_bird', color='payment_complete', barmode='group',
title='Payment Completion by Early Bird Status', labels={'is_early_bird': 'Is Early Bird', 'count': 'Count'})
fig_early_hist.update_layout(showlegend=True)
fig_early_hist.update_traces(texttemplate='%{y}', textposition='auto')
fig_early_hist.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_early_hist.show()
df_paid = df.loc[df['payment_complete'] == 'Yes']
df_paid['signup_time'] = pd.to_datetime(df_paid['signup_time'])
df_paid['payment_time'] = pd.to_datetime(df_paid['payment_time'])
# Calculate the time deltas in minutes
df_paid['time_delta'] = (df_paid['payment_time'] - df_paid['signup_time']).dt.total_seconds() / 60
Also, the code below plots for "Number of kids and adults across the users who paid.". To save memory, the code has been commented.
# sum_kids = df_paid['number_of_kids'].sum()
# sum_adults = df_paid['number_of_adults'].sum()
# age_dict = {'Adults': [sum_adults], 'Children': [sum_kids]}
# age_group_counts = pd.DataFrame(age_dict).T
# age_group_counts = pd.DataFrame(age_group_counts).reset_index(drop = False)
# age_group_counts.rename(columns = {'index': 'age_group', 0: 'count'}, inplace = True)
# fig_age_group = px.bar(x=age_group_counts['age_group'], y=age_group_counts['count'],
# labels={'x': 'Age Group', 'y': 'Count'}, title='Age Group Distribution')
# fig_age_group.update_traces(texttemplate='%{y}', textposition='inside')
# fig_age_group.update_layout(yaxis=dict(automargin=True))
# fig_age_group.update_layout(
# template='plotly_dark', # Apply dark theme
# font_color='white', # Set font color to white for better visibility
# )
# fig_age_group.show()
def get_individual_user_df(df: pd.DataFrame) -> pd.DataFrame:
signup_time_arr = np.repeat(df['signup_time'], df['number_of_people'])
ages_arr = np.concatenate(df['ages'].str.split(', ').values)
genders_arr = np.concatenate(df['genders'].str.split(', ').values)
plans_arr = np.concatenate(df['plans'].str.split(', ').values)
plan_months_arr = np.repeat(df['plan_months'], df['number_of_people'])
payment_time_arr = np.repeat(df['payment_time'], df['number_of_people'])
is_early_bird_arr = np.repeat(df['is_early_bird'], df['number_of_people'])
payment_amount_arr = np.repeat(df['payment_amount'], df['number_of_people'])
payment_complete_arr = np.repeat(df['payment_complete'], df['number_of_people'])
number_of_people_arr = np.repeat(df['number_of_people'], df['number_of_people'])
set_arr = np.repeat(df.index, df['number_of_people'])
age_arr = ages_arr.astype(int)
age_group_arr = ['Adult' if age > 18 else "Child" for age in age_arr]
new_data = pd.DataFrame({
'signup_time': signup_time_arr,
'age': age_arr,
'age_group': age_group_arr,
'gender': genders_arr,
'plans': plans_arr,
'plan_months': plan_months_arr,
'payment_time': payment_time_arr,
'is_early_bird': is_early_bird_arr,
'payment_amount': payment_amount_arr,
'payment_complete': payment_complete_arr,
'number_of_people': number_of_people_arr,
'set': set_arr
})
return new_data
new_data = get_individual_user_df(df)
new_data.head(15)
| signup_time | age | age_group | gender | plans | plan_months | payment_time | is_early_bird | payment_amount | payment_complete | number_of_people | set | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2022-01-01 00:05:27 | 45 | Adult | FEMALE | PLUS | 12 | NaN | False | 0 | No | 1 | 0 |
| 1 | 2022-01-01 00:06:44 | 42 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 1 | 1 |
| 2 | 2022-01-01 00:17:55 | 52 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 1 | 2 |
| 3 | 2022-01-01 00:20:45 | 48 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 1 | 3 |
| 4 | 2022-01-01 00:27:36 | 61 | Adult | MALE | LITE | 12 | NaN | False | 0 | No | 1 | 4 |
| 5 | 2022-01-01 00:30:20 | 52 | Adult | MALE | PLUS | 3 | NaN | False | 0 | No | 1 | 5 |
| 6 | 2022-01-01 00:36:45 | 32 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 1 | 6 |
| 7 | 2022-01-01 01:00:05 | 32 | Adult | MALE | PLUS | 3 | NaN | False | 0 | No | 3 | 7 |
| 7 | 2022-01-01 01:00:05 | 40 | Adult | MALE | PLUS | 3 | NaN | False | 0 | No | 3 | 7 |
| 7 | 2022-01-01 01:00:05 | 41 | Adult | MALE | PLUS | 3 | NaN | False | 0 | No | 3 | 7 |
| 8 | 2022-01-01 01:18:33 | 30 | Adult | FEMALE | PLUS | 12 | NaN | False | 0 | No | 1 | 8 |
| 9 | 2022-01-01 01:51:53 | 57 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 3 | 9 |
| 9 | 2022-01-01 01:51:53 | 44 | Adult | MALE | PLUS | 12 | NaN | False | 0 | No | 3 | 9 |
| 9 | 2022-01-01 01:51:53 | 36 | Adult | FEMALE | PLUS | 12 | NaN | False | 0 | No | 3 | 9 |
| 10 | 2022-01-01 02:25:56 | 30 | Adult | FEMALE | PLUS | 12 | NaN | False | 0 | No | 1 | 10 |
new_data.info()
<class 'pandas.core.frame.DataFrame'> Index: 346789 entries, 0 to 232318 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 signup_time 346789 non-null object 1 age 346789 non-null int64 2 age_group 346789 non-null object 3 gender 346789 non-null object 4 plans 346789 non-null object 5 plan_months 346789 non-null int64 6 payment_time 64761 non-null object 7 is_early_bird 346789 non-null bool 8 payment_amount 346789 non-null int64 9 payment_complete 346789 non-null object 10 number_of_people 346789 non-null int64 11 set 346789 non-null int64 dtypes: bool(1), int64(5), object(6) memory usage: 32.1+ MB
new_data.describe()
| age | plan_months | payment_amount | number_of_people | set | |
|---|---|---|---|---|---|
| count | 346789.000000 | 346789.000000 | 346789.000000 | 346789.000000 | 346789.000000 |
| mean | 33.681083 | 9.009161 | 5461.984544 | 1.982159 | 116070.569917 |
| std | 13.214198 | 4.130871 | 15190.756958 | 1.139260 | 67014.261209 |
| min | 0.000000 | 3.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25% | 27.000000 | 3.000000 | 0.000000 | 1.000000 | 58100.000000 |
| 50% | 33.000000 | 12.000000 | 0.000000 | 2.000000 | 115980.000000 |
| 75% | 41.000000 | 12.000000 | 0.000000 | 3.000000 | 174032.000000 |
| max | 64.000000 | 12.000000 | 223014.000000 | 9.000000 | 232318.000000 |
num_sets = new_data['set'].nunique()
hist_data = new_data.groupby(['age_group', 'gender', 'payment_complete']).size().reset_index(name='count')
total_counts = hist_data.groupby(['age_group', 'gender'])['count'].transform('sum')
hist_data['percentage'] = (hist_data['count'] / total_counts) * 100
hist_data['percentage'] = hist_data['percentage'].round(1)
fig_hist = px.bar(hist_data, x='gender', y='percentage', color='payment_complete', barmode='group',
facet_col='age_group', facet_col_wrap=2,
title=f'Gender and Age Group Breakdown with Payment Completion (Unique Transactions: {num_sets})',
labels={'gender': 'Gender', 'percentage': 'Percentage', 'age_group': 'Age Group'},
text='percentage')
fig_hist.update_traces(texttemplate='%{text}%', textposition='inside')
fig_hist.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_hist.show()
Consider the fields signup_time and payment_time. They stand for the time a given user (who then may add multiple family members) signed up and then paid, respectively. In a single plot, how can you best show the distribution of time "deltas" between the sign up time and payment time (i.e. how long it takes for people to pay once they have signed up)? What is the best way to condense the relevant information and insights? Remember it needs to be a single, static plot, which ideally should not need to be magnified to make sense.
# fig_box = px.box(df_paid, y='time_delta', title='Distribution of Time Deltas: Sign-up to Payment',
# labels={'time_delta': 'Time Delta (minutes)'})
# fig_box.update_layout(
# template='plotly_dark', # Apply dark theme
# font_color='white', # Set font color to white for better visibility
# )
# fig_box.show()
fig_violin = go.Figure(data=go.Violin(y=df_paid['time_delta'], box_visible=True, meanline_visible=True, name='Violin'))
fig_violin.update_layout(yaxis_title='Time Delta (minutes)', xaxis_title='Distribution', title='Distribution of Time Deltas: Sign-up to Payment')
fig_violin.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
width = 1000,
height = 900
)
fig_violin.show()
color_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
fig_box = px.box(df_paid.sort_values(['number_of_people']), x='number_of_people', y='time_delta', title='Time Delta Distribution by Number of People',
labels={'number_of_people': 'Number of People', 'time_delta': 'Time Delta'},
color='number_of_people', color_discrete_sequence=color_palette)
fig_box.update_traces(quartilemethod="exclusive")
fig_box.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_box.show()
color_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
fig_box = px.box(df_paid.sort_values(['number_of_kids', 'number_of_adults']), x='number_of_adults', y='time_delta',
title='Time Delta Distribution by Number of Adults separated by Number of Kids',
labels={'number_of_adults': 'Number of Adults', 'time_delta': 'Time Delta'},
color='number_of_kids', color_discrete_sequence=color_palette)
fig_box.update_traces(quartilemethod="exclusive")
fig_box.update_layout(
yaxis=dict(title=dict(text='Time Delta')),
margin=dict(t=100),
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_box.show()
import pandas as pd
import plotly.graph_objects as go
df_paid['payment_amount_bins'] = pd.cut(df_paid['payment_amount'], bins=10, precision=2, include_lowest=True).astype(str)
grouped_data = df_paid.groupby('payment_amount_bins')['time_delta'].mean().reset_index()
grouped_data['payment_amount_bins'] = grouped_data['payment_amount_bins'].str.replace(r'\(|\]', '', regex=True).str.split(', ')
grouped_data['payment_amount_bins'] = grouped_data['payment_amount_bins'].apply(lambda x: float(x[0]))
grouped_data = grouped_data.sort_values('payment_amount_bins')
fig_bar = go.Figure()
fig_bar.add_trace(go.Bar(
x=grouped_data['payment_amount_bins'],
y=grouped_data['time_delta'],
marker_color='blue',
))
fig_bar.update_layout(
title='Average Time Delta by Binned Payment Amount',
xaxis_title='Payment Amount',
yaxis_title='Average Time Delta',
template='plotly_dark',
font_color='white',
width=800,
height=600
)
fig_bar.show()
Since it needs to be single, static plot to show the distribution of the time delta between registration time and payment time, I'd go for a violin plot. (Cell 23) It tells us quite a few things about the time_delta:
From the Violin plot we can infer the following:
You are given the payment amounts but you don't know what the underlying price function is, and what its inputs are (though you can assume they are a subset of the given fields). If you had to treat this as a prediction problem, what kind of model would you use? PLEASE DO NOT ACTUALLY ATTEMPT MODELLING. Base your answer on any data exploration you did (and feel free to show plots/stats), but what we are looking is simply a discussion of what may be some of the modelling challenges here and how to pick a model which can overcome them.
### Selecting certain columns only
df_paid_selected = df_paid[['plan_months', 'is_early_bird', 'payment_amount', 'number_of_people', 'number_of_male', 'number_of_female',
'number_of_kids', 'number_of_adults']]
grouped_data = df_paid_selected.groupby(['number_of_people', 'plan_months']).mean()['payment_amount'].reset_index()
fig_payment_amount = go.Figure()
for months in grouped_data['plan_months'].unique():
data_filtered = grouped_data[grouped_data['plan_months'] == months]
fig_payment_amount.add_trace(go.Bar(
x=data_filtered['number_of_people'],
y=data_filtered['payment_amount'],
name=f'{months} Months'
))
fig_payment_amount.update_layout(
title='Number of People and Number of Months vs Payment Amount (Paid Users)',
xaxis_title='Number of People',
yaxis_title='Payment Amount'
)
fig_payment_amount.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_payment_amount.show()
import plotly.graph_objects as go
grouped_df = df_paid_selected.groupby(['number_of_kids', 'number_of_adults']).mean().reset_index()
fig_kid_adult = go.Figure(data=[
go.Bar(name='Payment Amount', x=grouped_df.index, y=grouped_df['payment_amount'])
])
fig_kid_adult.update_layout(
title='Payment Amount vs Number of Kids and Adults',
xaxis=dict(
tickmode='array',
tickvals=grouped_df.index,
ticktext=[f"Kids: {kids}, Adults: {adults}" for kids, adults in zip(grouped_df['number_of_kids'], grouped_df['number_of_adults'])]
),
yaxis_title='Payment Amount'
)
fig_kid_adult.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_kid_adult.show()
import plotly.graph_objects as go
grouped_df = df_paid_selected.groupby('payment_amount')[['number_of_male', 'number_of_female']].sum().reset_index()
payment_amount = grouped_df['payment_amount']
number_of_male = grouped_df['number_of_male']
number_of_female = grouped_df['number_of_female']
fig_payment_gender = go.Figure()
fig_payment_gender.add_trace(go.Scatter(
x=payment_amount,
y=number_of_male,
mode='markers',
name='Number of Males',
marker=dict(
size=10,
color='blue',
symbol='circle'
)
))
fig_payment_gender.add_trace(go.Scatter(
x=payment_amount,
y=number_of_female,
mode='markers',
name='Number of Females',
marker=dict(
size=10,
color='pink',
symbol='circle'
)
))
fig_payment_gender.update_layout(
title='Payment Amount vs Number of Males and Females (Paid Users)',
xaxis_title='Payment Amount',
yaxis_title='Number of People'
)
fig_payment_gender.update_layout(
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_payment_gender.show()
grouped_df = df_paid_selected.groupby(['number_of_people', 'is_early_bird']).mean().reset_index()
fig_people_early = go.Figure()
for num_people in grouped_df['number_of_people'].unique():
filtered_df = grouped_df[grouped_df['number_of_people'] == num_people]
fig_people_early.add_trace(go.Bar(
name=f'Number of People: {num_people}',
x=filtered_df['is_early_bird'],
y=filtered_df['payment_amount']
))
fig_people_early.update_layout(
title='Payment Amount vs Number of People and Early Bird',
xaxis_title='Is Early Bird',
yaxis_title='Payment Amount',
barmode='group',
template='plotly_dark', # Apply dark theme
font_color='white', # Set font color to white for better visibility
)
fig_people_early.show()
Armed with this knowledge we can work towards thinking of a model that might fit the data. Ideally, I'd use a whole bunch of models on a subset of the data to see how everything performs after feature engineering a few things like price for adult, price for kid, price for adult with early pass, price for kid with early pass (This requires us to know the pricing formula or we could come up with our own by splitting families and checking the prices by using new_data)
The model that I think would work is a XGBoost Regressor.
As XGBoost can work on smaller datasets and can handle imbalance, less number of features, and can handle outliers well while being flexible enough to take both categorical and numeric features, I'd pick XGBoost as my choice of model to predict payment amount using the features we have and the ones we engineered.